INTRO
Importamos las librerías a utilizar
require(dplyr)
Loading required package: dplyr
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
require(ggplot2)
Loading required package: ggplot2
require(tsibble)
Loading required package: tsibble
Warning: package ‘tsibble’ was built under R version 4.2.2
Attaching package: ‘tsibble’
The following objects are masked from ‘package:base’:
intersect, setdiff, union
require(lubridate)
Loading required package: lubridate
Attaching package: ‘lubridate’
The following object is masked from ‘package:tsibble’:
interval
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
library(prophet)
Warning: package ‘prophet’ was built under R version 4.2.2Loading required package: Rcpp
Loading required package: rlang
EDA
Importamos los datasets con los que se va a trabajar:
df_abc <- read.csv("./datasets/202208_PAX15min-ABC.csv", sep=";")
df_deh <- read.csv("./datasets/202208_PAX15min-DEH.csv", sep=";")
df <- bind_rows(df_abc, df_deh)
rm(df_abc, df_deh)
print(colnames(df))
[1] "FECHA" "DESDE" "HASTA" "LINEA" "MOLINETE" "ESTACION" "pax_pagos" "pax_pases_pagos" "pax_franq"
[10] "pax_TOTAL"
Reviso datasets anteriores:
df_2022 <- read.csv("./datasets/molinetes_2022.csv", sep=",")
print(colnames(df_2022))
[1] "FECHA" "DESDE" "HASTA" "LINEA"
[5] "MOLINETE" "ESTACION" "pax_pagos" "pax_pases_pagos"
[9] "pax_franq" "pax_TOTAL"
df_2022 <- df_2022 %>%
mutate(ts = dmy_hms(paste0(FECHA," " ,DESDE)))
df_2021 <- read.csv("./datasets/molinetes_2021.csv", sep=";")
print(colnames(df_2021))
[1] "periodo" "FECHA" "DESDE" "HASTA"
[5] "LINEA" "MOLINETE" "ESTACION" "pax_pagos"
[9] "pax_pases_pagos" "pax_franq" "pax_TOTAL"
df_2021 <- df_2021 %>%
mutate(ts = dmy_hms(paste0(FECHA," " ,DESDE)))
Warning: 1043613 failed to parse.
summary(df_2021)
periodo FECHA DESDE HASTA
Min. :2021 Length:8071680 Length:8071680 Length:8071680
1st Qu.:2021 Class :character Class :character Class :character
Median :2021 Mode :character Mode :character Mode :character
Mean :2021
3rd Qu.:2021
Max. :2021
LINEA MOLINETE ESTACION pax_pagos
Length:8071680 Length:8071680 Length:8071680 Min. : 0.00
Class :character Class :character Class :character 1st Qu.: 2.00
Mode :character Mode :character Mode :character Median : 7.00
Mean : 11.28
3rd Qu.: 15.00
Max. :292.00
pax_pases_pagos pax_franq pax_TOTAL
Min. :0.000000 Min. : 0.0000 Min. : 1.0
1st Qu.:0.000000 1st Qu.: 0.0000 1st Qu.: 3.0
Median :0.000000 Median : 0.0000 Median : 7.0
Mean :0.006266 Mean : 0.6143 Mean : 11.9
3rd Qu.:0.000000 3rd Qu.: 1.0000 3rd Qu.: 16.0
Max. :9.000000 Max. :54.0000 Max. :298.0
ts
Min. :2021-01-01 08:00:00.00
1st Qu.:2021-06-16 13:00:00.00
Median :2021-09-07 12:30:00.00
Mean :2021-08-20 11:27:10.45
3rd Qu.:2021-11-06 08:15:00.00
Max. :2021-12-31 22:00:00.00
NA's :1043613
summary(df_2022)
FECHA DESDE HASTA LINEA
Length:2468478 Length:2468478 Length:2468478 Length:2468478
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
MOLINETE ESTACION pax_pagos pax_pases_pagos
Length:2468478 Length:2468478 Min. : 0.00 Min. : 0.00000
Class :character Class :character 1st Qu.: 4.00 1st Qu.: 0.00000
Mode :character Mode :character Median : 12.00 Median : 0.00000
Mean : 18.72 Mean : 0.02097
3rd Qu.: 26.00 3rd Qu.: 0.00000
Max. :390.00 Max. :11.00000
pax_franq pax_TOTAL ts
Min. : 0.000 Min. : 1 Min. :2022-04-01 05:15:00.00
1st Qu.: 0.000 1st Qu.: 4 1st Qu.:2022-04-13 16:45:00.00
Median : 0.000 Median : 12 Median :2022-04-26 19:15:00.00
Mean : 0.917 Mean : 20 Mean :2022-04-28 17:18:53.03
3rd Qu.: 1.000 3rd Qu.: 27 3rd Qu.:2022-05-12 17:45:00.00
Max. :65.000 Max. :391 Max. :2022-05-31 23:30:00.00
NA's :851968
Grafico por línea y separo por estacion
df_linea <- df %>%
group_by(FECHA,DESDE,HASTA,LINEA,ESTACION) %>%
summarize(pasajeros = sum(pax_TOTAL)) %>%
mutate(ts = dmy_hms(paste0(FECHA," " ,DESDE)))%>%
ungroup() %>%
select(-c(FECHA, DESDE, HASTA))
`summarise()` has grouped output by 'FECHA', 'DESDE', 'HASTA', 'LINEA'. You can override using the `.groups` argument.
library(plotly)
library(htmlwidgets)
setwd("D:\\OneDrive\\Personal\\Maestria Data Mining\\Cuatrimestre_2\\Enfoque Estadístico del Aprendizaje\\TP\\EEA-TPs\\TP2\\tp2-EEA-2022\\output")
Warning: The working directory was changed to D:/OneDrive/Personal/Maestria Data Mining/Cuatrimestre_2/Enfoque Estadístico del Aprendizaje/TP/EEA-TPs/TP2/tp2-EEA-2022/output inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
dia <- c("2022-08-08","2022-08-09","2022-08-10","2022-08-11","2022-08-12","2022-08-13","2022-08-14")
dia_final <- c("2022-08-09","2022-08-10","2022-08-11","2022-08-12","2022-08-13","2022-08-14","2022-08-15")
nombre_dia <- c('Lunes','Martes','Miercoles','Jueves','Viernes','Sabado','Domingo')
i<- 1
lista_lineas <- c('LineaA','LineaB','LineaC','LineaD','LineaE','LineaH')
for (dia_inicial in dia){
print(i)
dia <- dia_inicial
dia1 <- dia_final[i]
nombre <- nombre_dia[i]
print(dia)
print(dia1)
print(nombre)
for (linea in lista_lineas){
p <- df_linea %>%
filter(ts > dia,ts<dia1,LINEA == linea) %>%
ggplot() + geom_line(aes(ts, pasajeros,color=ESTACION)) + ggtitle(paste('Pasajeros para el dia',nombre,'en',linea,sep=" ")+
scale_x_date(date_breaks = "1 hour", date_labels = "%H:%M") +
scale_x_date(date_minor_breaks = "15 minutes"))
print(ggplotly(p))
saveWidget(ggplotly(p), file = paste('Pasajeros para el dia',nombre,'en',linea,'.html',sep=" "));
}
i <- i+1
}
[1] 1
[1] "2022-08-08"
[1] "2022-08-09"
[1] "Lunes"
[1] 2
[1] "2022-08-09"
[1] "2022-08-10"
[1] "Martes"
[1] 3
[1] "2022-08-10"
[1] "2022-08-11"
[1] "Miercoles"
[1] 4
[1] "2022-08-11"
[1] "2022-08-12"
[1] "Jueves"
[1] 5
[1] "2022-08-12"
[1] "2022-08-13"
[1] "Viernes"
[1] 6
[1] "2022-08-13"
[1] "2022-08-14"
[1] "Sabado"
[1] 7
[1] "2022-08-14"
[1] "2022-08-15"
[1] "Domingo"
Agrupamos la información a nivel estación/horario y convertimos a formato fecha/hora:
df = df %>%
group_by(FECHA, DESDE, HASTA, LINEA, ESTACION) %>% # evaluar agrupar a nivel linea y quedarse con una única línea para el análisis
summarize(pasajeros = sum(pax_TOTAL)) %>%
mutate(ts = dmy_hms(paste0(FECHA," " ,DESDE))) %>%
ungroup() %>%
select(-c(FECHA, DESDE, HASTA)) ## ver si dejar esto o no
Agrupamos la información a nivel estación/horario y convertimos a formato fecha/hora:
df %>%
filter(ts > "2022-01-15" & ESTACION == "Florida") %>%
ggplot(aes(ts, pasajeros)) + geom_line()
Error in `filter()`:
! Problem while computing `..1 = ts > "2022-01-15" & ESTACION == "Florida"`.
Caused by error in `ts > "2022-01-15"`:
! comparison (6) is possible only for atomic and list types
Backtrace:
1. ... %>% ggplot(aes(ts, pasajeros))
4. dplyr:::filter.data.frame(., ts > "2022-01-15" & ESTACION == "Florida")
5. dplyr:::filter_rows(.data, ..., caller_env = caller_env())
6. dplyr:::filter_eval(dots, mask = mask, error_call = error_call)
8. mask$eval_all_filter(dots, env_filter)
df_analisis <- df_linea %>%
filter(ts > "2022-08-08" & ts < "2022-08-15" & ESTACION == "Palermo")
df_analisis_hora <- df_analisis %>%
mutate('Hora'=hour(ts),'Dia'=date(ts)) %>%
mutate('Hora'=paste0(Hora,':00'))
df_analisis_hora <- df_analisis_hora %>%
group_by(LINEA,ESTACION,Dia,Hora) %>%
summarise(pasajeros = sum(pasajeros))
`summarise()` has grouped output by 'LINEA', 'ESTACION', 'Dia'. You can override using the `.groups` argument.
df_analisis_hora <- df_analisis_hora %>%
mutate(ts = parse_date_time(paste0(Dia," ",Hora), "Ymd HMS", truncated = 3))
################################
df_analisis_hora_test <- df_linea %>%
filter(ts > "2022-08-15" & ts < "2022-09-01" & ESTACION == "Palermo")
df_analisis_hora_test <- df_analisis_hora_test %>%
mutate('Hora'=hour(ts),'Dia'=date(ts)) %>%
mutate('Hora'=paste0(Hora,':00'))
df_analisis_hora_test <- df_analisis_hora_test %>%
group_by(LINEA,ESTACION,Dia,Hora) %>%
summarise(pasajeros = sum(pasajeros))
`summarise()` has grouped output by 'LINEA', 'ESTACION', 'Dia'. You can override using the `.groups` argument.
df_analisis_hora_test <- df_analisis_hora_test %>%
mutate(ts = parse_date_time(paste0(Dia," ",Hora), "Ymd HMS", truncated = 3))
summary(df_linea)
LINEA ESTACION pasajeros ts
Length:196190 Length:196190 Min. : 1.0 Min. :2022-08-01 05:15:00.00
Class :character Class :character 1st Qu.: 31.0 1st Qu.:2022-08-08 17:45:00.00
Mode :character Mode :character Median : 73.0 Median :2022-08-16 15:15:00.00
Mean : 117.2 Mean :2022-08-16 13:15:56.01
3rd Qu.: 154.0 3rd Qu.:2022-08-24 10:00:00.00
Max. :4146.0 Max. :2022-08-31 23:30:00.00
p <- ggplot(df_analisis_hora,aes(ts, pasajeros)) + geom_line() + ggtitle('Pasajeros para Palermo en dos semanas')
ggplotly(p)
library(forecast)
auto_arima <- auto.arima(df_analisis_hora[,5])
summary(auto_arima)
Series: df_analisis_hora[, 5]
ARIMA(2,0,0) with non-zero mean
Coefficients:
ar1 ar2 mean
1.1508 -0.4620 803.9743
s.e. 0.0771 0.0775 68.5223
sigma^2 = 61146: log likelihood = -900.03
AIC=1808.07 AICc=1808.39 BIC=1819.54
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set -0.4661518 244.4081 194.2803 -101.9805 120.8804 0.903825 -0.03374763
valores_prediccion <- forecast(auto_arima,24)
autoplot(valores_prediccion)